{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 制作干净的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd \n",
    "from pathlib import Path\n",
    "import os \n",
    "import shutil\n",
    "import imghdr\n",
    "from glob import glob\n",
    "from tqdm import tqdm "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_path</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bigdata/image_data\\test-0.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bigdata/image_data\\test-1.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bigdata/image_data\\test-10.jpg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       image_path\n",
       "0   bigdata/image_data\\test-0.jpg\n",
       "1   bigdata/image_data\\test-1.jpg\n",
       "2  bigdata/image_data\\test-10.jpg"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the table of image paths and preview the first rows.\n",
    "image_path_csv = \"bigdata/temp_dir/temp_image_url.csv\"\n",
    "image_path_df = pd.read_csv(image_path_csv)\n",
    "image_path_df.head(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>url</th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>欧美夏季ebay连衣裙 气质圆领通勤绑带收腰连衣裙 zc3730</td>\n",
       "      <td>https://gimg2.baidu.com/image_search/src=http%...</td>\n",
       "      <td>test-0.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>曾是名不见经传的王平,为何能够取代魏延,成为蜀汉</td>\n",
       "      <td>https://pic.rmb.bdstatic.com/19539b3b1a7e1daee...</td>\n",
       "      <td>test-1.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>女童黄色连衣裙</td>\n",
       "      <td>https://gimg2.baidu.com/image_search/src=http%...</td>\n",
       "      <td>test-2.jpg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               text  \\\n",
       "0  欧美夏季ebay连衣裙 气质圆领通勤绑带收腰连衣裙 zc3730   \n",
       "1          曾是名不见经传的王平,为何能够取代魏延,成为蜀汉   \n",
       "2                           女童黄色连衣裙   \n",
       "\n",
       "                                                 url    filename  \n",
       "0  https://gimg2.baidu.com/image_search/src=http%...  test-0.jpg  \n",
       "1  https://pic.rmb.bdstatic.com/19539b3b1a7e1daee...  test-1.jpg  \n",
       "2  https://gimg2.baidu.com/image_search/src=http%...  test-2.jpg  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the test CSV and the train CSV, stack them (test first), and renumber rows from 0.\n",
    "raw_csv_paths = [\n",
    "    \"bigdata/raw_data/test-2.6w.csv\",\n",
    "    \"bigdata/raw_data/train-137w.csv\",\n",
    "]\n",
    "train_test_data = pd.concat(\n",
    "    [pd.read_csv(csv_path) for csv_path in raw_csv_paths],\n",
    "    ignore_index=True,\n",
    ")\n",
    "train_test_data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_path</th>\n",
       "      <th>text</th>\n",
       "      <th>train_or_test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bigdata/image_data/test-0.jpg</td>\n",
       "      <td>欧美夏季ebay连衣裙 气质圆领通勤绑带收腰连衣裙 zc3730</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bigdata/image_data/test-1.jpg</td>\n",
       "      <td>曾是名不见经传的王平,为何能够取代魏延,成为蜀汉</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bigdata/image_data/test-10.jpg</td>\n",
       "      <td>考勤加班明细表</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       image_path                              text  \\\n",
       "0   bigdata/image_data/test-0.jpg  欧美夏季ebay连衣裙 气质圆领通勤绑带收腰连衣裙 zc3730   \n",
       "1   bigdata/image_data/test-1.jpg          曾是名不见经传的王平,为何能够取代魏延,成为蜀汉   \n",
       "2  bigdata/image_data/test-10.jpg                           考勤加班明细表   \n",
       "\n",
       "  train_or_test  \n",
       "0          test  \n",
       "1          test  \n",
       "2          test  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# image_path may contain Windows-style backslash separators. Convert them to '/'\n",
    "# BEFORE extracting the filename: on POSIX, Path(...).name does not split on\n",
    "# backslashes, so the join on 'filename' would silently match nothing.\n",
    "clean_train_test_df = (\n",
    "    image_path_df\n",
    "    .assign(image_path=lambda df: df['image_path'].str.replace('\\\\', '/', regex=False))\n",
    "    # The filename is the last '/'-separated component of the normalised path.\n",
    "    .assign(filename=lambda df: df['image_path'].str.rsplit('/', n=1).str[-1])\n",
    "    # Left join keeps every image row; rows without a matching filename get NaN text.\n",
    "    .merge(right=train_test_data, how='left', on='filename')\n",
    "    [['image_path', 'text']]\n",
    "    # A row belongs to the train split when its path contains 'train', else test.\n",
    "    .assign(\n",
    "        train_or_test=lambda df: df['image_path'].apply(\n",
    "            lambda path: 'train' if 'train' in path else 'test'\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "clean_train_test_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_train_test_dir = \"bigdata/clean_train_test\"\n",
    "\n",
    "# Best-effort removal of any previous output; a missing directory is not an error.\n",
    "shutil.rmtree(clean_train_test_dir, ignore_errors=True)\n",
    "\n",
    "# Because the removal above ignores errors, the directory may still exist here;\n",
    "# exist_ok=True keeps the cell from failing with FileExistsError in that case.\n",
    "os.makedirs(clean_train_test_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one CSV per split (test.csv, train.csv), dropping the split label column.\n",
    "for split_name in (\"test\", \"train\"):\n",
    "    split_mask = clean_train_test_df['train_or_test'] == split_name\n",
    "    split_df = clean_train_test_df[split_mask].drop(columns=['train_or_test'])\n",
    "    split_df.to_csv(Path(clean_train_test_dir).joinpath(f\"{split_name}.csv\"), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small random subsets of each split. random_state is fixed so that\n",
    "# Restart & Run All writes the same rows every time.\n",
    "(\n",
    "    clean_train_test_df.query('train_or_test == \"test\"')\n",
    "    .drop(columns=['train_or_test'])\n",
    "    .sample(200, random_state=42)\n",
    "    .to_csv(Path(clean_train_test_dir).joinpath(\"test_small.csv\"), index=False)\n",
    ")\n",
    "(\n",
    "    clean_train_test_df.query('train_or_test == \"train\"')\n",
    "    .drop(columns=['train_or_test'])\n",
    "    .sample(1000, random_state=42)\n",
    "    .to_csv(Path(clean_train_test_dir).joinpath(\"train_small.csv\"), index=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Median text length (in characters) over a seeded 20000-row sample of the train split.\n",
    "# .str.len() returns NaN for rows whose text is missing (possible after the left join)\n",
    "# where len() would raise TypeError; median() skips those NaNs.\n",
    "(\n",
    "    clean_train_test_df.query('train_or_test == \"train\"')\n",
    "    .sample(20000, random_state=42)['text']\n",
    "    .str.len()\n",
    "    .median()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clean_train_test_df.query('train_or_test == \"train\"')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "110bc624a448454d574a0cd6cc76359fd86f75739e493913b3d71c2e04f2ffdb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
